IMPORT EVERYTHING

In [1]:
# Commands to install some of the libraries in-case if they are not installed
# Any other library that needs to be installed just use: !pip install <library name>
# !pip install seaborn
# !pip install missingno
# !pip install xgboost
# !pip install catboost
# !pip install regex
# !pip install sklearn
# !pip install pandas
# !pip install numpy
# !pip install imblearn
# pip install lightgbm
# !pip install --upgrade matplotlib
In [2]:
import pandas as pd   # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np   # linear algebra
import matplotlib.pyplot as plt  #graphs and plots
import seaborn as sns   #data visualizations 
import csv # Some extra functionalities for csv  files - reading it as a dictionary
from lightgbm import LGBMClassifier #sklearn is for machine learning and statistical modeling including classification, regression, clustering and dimensionality reduction 

from sklearn.model_selection import train_test_split, cross_validate   #break up dataset into train and test sets

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler


# importing python library for working with missing data
import missingno as msno
# To install missingno use: !pip install missingno
import re    # This library is used to perform regex pattern matching

# import various functions from sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, classification_report, make_scorer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

Import additional items as needed...

In [3]:
from sklearn.model_selection import KFold,cross_val_score, RepeatedStratifiedKFold,StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.compose import make_column_transformer
from imblearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score,\
                            precision_score, recall_score, roc_auc_score,\
                            ConfusionMatrixDisplay, classification_report, RocCurveDisplay, f1_score
from sklearn.linear_model import LinearRegression
import plotly 
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings("ignore")

Exploratory Data Analysis (EDA)¶

Start with Loading the CSV Data¶

In [4]:
diabetic = pd.read_csv('D:/GitHub/HHA550_Analysis/finaldata/cleaned_diabetic_data_final_presentation1.csv')
In [5]:
diabetic.head()
Out[5]:
Unnamed: 0 race gender age admission_type_id discharge_disposition_id admission_source_id time_in_hospital num_lab_procedures num_procedures ... citoglipton insulin glyburide-metformin glipizide-metformin glimepiride-pioglitazone metformin-rosiglitazone metformin-pioglitazone change diabetesMed readmitted
0 1 3 0 1 1 1 7 3 59 0 ... 0 3 0 0 0 0 0 1 1 0
1 2 1 0 2 1 1 7 2 11 5 ... 0 0 0 0 0 0 0 0 1 0
2 3 3 1 3 1 1 7 2 44 1 ... 0 3 0 0 0 0 0 1 1 0
3 4 3 1 4 1 1 7 1 51 0 ... 0 2 0 0 0 0 0 1 1 0
4 5 3 1 5 2 1 2 3 31 6 ... 0 2 0 0 0 0 0 0 1 0

5 rows × 44 columns

Insights into our target variable¶

In [6]:
y = diabetic['readmitted']
print(f'Percentage of patient who readmitted within 30 days: % {round(y.value_counts(normalize=True)[1]*100,2)} --> ({y.value_counts()[1]} patient)\nPercentage of patient did not readmit within 30 days: % {round(y.value_counts(normalize=True)[0]*100,2)} --> ({y.value_counts()[1]} patient)')
Percentage of patient who readmitted within 30 days: % 11.23 --> (9486 patient)
Percentage of patient did not readmit within 30 days: % 88.77 --> (9486 patient)
In [7]:
fig = px.histogram(diabetic, x="readmitted", title='Readmitted', width=400, height=400)
fig.show()

There is an imbalance in the data regarding patient readmission within 30 days.

Since there is an imbalance, some metrics such as accuracy give us misleading results.

Check For Missing Values¶

In [8]:
msno.matrix(diabetic)
Out[8]:
<Axes: >

No missing values.

Decide on Metrics¶

Since the target data is skewed, the best metric for this binary classification problem would be Area Under the ROC Curve (AUC).

In [9]:
diabetic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84447 entries, 0 to 84446
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Unnamed: 0                84447 non-null  int64
 1   race                      84447 non-null  int64
 2   gender                    84447 non-null  int64
 3   age                       84447 non-null  int64
 4   admission_type_id         84447 non-null  int64
 5   discharge_disposition_id  84447 non-null  int64
 6   admission_source_id       84447 non-null  int64
 7   time_in_hospital          84447 non-null  int64
 8   num_lab_procedures        84447 non-null  int64
 9   num_procedures            84447 non-null  int64
 10  num_medications           84447 non-null  int64
 11  number_outpatient         84447 non-null  int64
 12  number_emergency          84447 non-null  int64
 13  number_inpatient          84447 non-null  int64
 14  diag_1                    84447 non-null  int64
 15  number_diagnoses          84447 non-null  int64
 16  max_glu_serum             84447 non-null  int64
 17  A1Cresult                 84447 non-null  int64
 18  metformin                 84447 non-null  int64
 19  repaglinide               84447 non-null  int64
 20  nateglinide               84447 non-null  int64
 21  chlorpropamide            84447 non-null  int64
 22  glimepiride               84447 non-null  int64
 23  acetohexamide             84447 non-null  int64
 24  glipizide                 84447 non-null  int64
 25  glyburide                 84447 non-null  int64
 26  tolbutamide               84447 non-null  int64
 27  pioglitazone              84447 non-null  int64
 28  rosiglitazone             84447 non-null  int64
 29  acarbose                  84447 non-null  int64
 30  miglitol                  84447 non-null  int64
 31  troglitazone              84447 non-null  int64
 32  tolazamide                84447 non-null  int64
 33  examide                   84447 non-null  int64
 34  citoglipton               84447 non-null  int64
 35  insulin                   84447 non-null  int64
 36  glyburide-metformin       84447 non-null  int64
 37  glipizide-metformin       84447 non-null  int64
 38  glimepiride-pioglitazone  84447 non-null  int64
 39  metformin-rosiglitazone   84447 non-null  int64
 40  metformin-pioglitazone    84447 non-null  int64
 41  change                    84447 non-null  int64
 42  diabetesMed               84447 non-null  int64
 43  readmitted                84447 non-null  int64
dtypes: int64(44)
memory usage: 28.3 MB

There are no missing values. Every column has 83254 non-null values.

Numerical Features¶

Look at the data elements using diabetic_head()
Look at the dtype using diabetic.info()
In [10]:
diabetic.head()
Out[10]:
Unnamed: 0 race gender age admission_type_id discharge_disposition_id admission_source_id time_in_hospital num_lab_procedures num_procedures ... citoglipton insulin glyburide-metformin glipizide-metformin glimepiride-pioglitazone metformin-rosiglitazone metformin-pioglitazone change diabetesMed readmitted
0 1 3 0 1 1 1 7 3 59 0 ... 0 3 0 0 0 0 0 1 1 0
1 2 1 0 2 1 1 7 2 11 5 ... 0 0 0 0 0 0 0 0 1 0
2 3 3 1 3 1 1 7 2 44 1 ... 0 3 0 0 0 0 0 1 1 0
3 4 3 1 4 1 1 7 1 51 0 ... 0 2 0 0 0 0 0 1 1 0
4 5 3 1 5 2 1 2 3 31 6 ... 0 2 0 0 0 0 0 0 1 0

5 rows × 44 columns

In [11]:
diabetic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84447 entries, 0 to 84446
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Unnamed: 0                84447 non-null  int64
 1   race                      84447 non-null  int64
 2   gender                    84447 non-null  int64
 3   age                       84447 non-null  int64
 4   admission_type_id         84447 non-null  int64
 5   discharge_disposition_id  84447 non-null  int64
 6   admission_source_id       84447 non-null  int64
 7   time_in_hospital          84447 non-null  int64
 8   num_lab_procedures        84447 non-null  int64
 9   num_procedures            84447 non-null  int64
 10  num_medications           84447 non-null  int64
 11  number_outpatient         84447 non-null  int64
 12  number_emergency          84447 non-null  int64
 13  number_inpatient          84447 non-null  int64
 14  diag_1                    84447 non-null  int64
 15  number_diagnoses          84447 non-null  int64
 16  max_glu_serum             84447 non-null  int64
 17  A1Cresult                 84447 non-null  int64
 18  metformin                 84447 non-null  int64
 19  repaglinide               84447 non-null  int64
 20  nateglinide               84447 non-null  int64
 21  chlorpropamide            84447 non-null  int64
 22  glimepiride               84447 non-null  int64
 23  acetohexamide             84447 non-null  int64
 24  glipizide                 84447 non-null  int64
 25  glyburide                 84447 non-null  int64
 26  tolbutamide               84447 non-null  int64
 27  pioglitazone              84447 non-null  int64
 28  rosiglitazone             84447 non-null  int64
 29  acarbose                  84447 non-null  int64
 30  miglitol                  84447 non-null  int64
 31  troglitazone              84447 non-null  int64
 32  tolazamide                84447 non-null  int64
 33  examide                   84447 non-null  int64
 34  citoglipton               84447 non-null  int64
 35  insulin                   84447 non-null  int64
 36  glyburide-metformin       84447 non-null  int64
 37  glipizide-metformin       84447 non-null  int64
 38  glimepiride-pioglitazone  84447 non-null  int64
 39  metformin-rosiglitazone   84447 non-null  int64
 40  metformin-pioglitazone    84447 non-null  int64
 41  change                    84447 non-null  int64
 42  diabetesMed               84447 non-null  int64
 43  readmitted                84447 non-null  int64
dtypes: int64(44)
memory usage: 28.3 MB

Seperate Categorical and Numerical elements

In [12]:
categorical = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'diag_1', 
               'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 
               'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 
               'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 
               'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 
               'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']

numerical = ['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 
                      'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
In [13]:
diabetic[numerical].describe()
Out[13]:
age time_in_hospital num_lab_procedures num_procedures num_medications number_outpatient number_emergency number_inpatient number_diagnoses
count 84447.000000 84447.000000 84447.000000 84447.000000 84447.000000 84447.000000 84447.000000 84447.000000 84447.000000
mean 6.084467 4.382737 43.656234 1.340107 15.997833 0.356354 0.208521 0.652386 7.518538
std 1.604887 2.977027 19.390216 1.708996 8.121821 1.257435 0.981372 1.285440 1.912526
min 0.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000
25% 5.000000 2.000000 33.000000 0.000000 10.000000 0.000000 0.000000 0.000000 6.000000
50% 6.000000 4.000000 45.000000 1.000000 15.000000 0.000000 0.000000 0.000000 9.000000
75% 7.000000 6.000000 57.000000 2.000000 20.000000 0.000000 0.000000 1.000000 9.000000
max 9.000000 14.000000 132.000000 6.000000 81.000000 42.000000 76.000000 21.000000 16.000000

Skewness¶

negative = left skew
positive = right skew
0        = no skew
In [14]:
diabetic[numerical].skew()
Out[14]:
age                   -0.637067
time_in_hospital       1.144088
num_lab_procedures    -0.327120
num_procedures         1.317758
num_medications        1.329845
number_outpatient      8.878266
number_emergency      22.791751
number_inpatient       3.538992
number_diagnoses      -0.967006
dtype: float64

Univariate Analysis¶

In [15]:
diabetic[numerical].hist(figsize=(20,10));
  • age has a left skew
  • time_in_hospital has a right skew
  • num_lab_procedures has a left skew
  • num_procedures has a right skew
  • num_medications has a right skew
  • num_outpatient has a right skew
  • number_emergency has a right skew
  • number inpatient has a right skew
  • number diagnoses has a left skew

Categorical Features¶

In [16]:
print (f'{round(diabetic["gender"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="gender", title='Gender', width=500, height=500)
fig.show()
gender
0    54.03
1    45.97
Name: proportion, dtype: float64
In [17]:
print (f'{round(diabetic["race"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="race", title='race', width=600, height=600)
fig.show()
race
3    75.79
1    20.22
4     1.81
5     1.52
2     0.66
Name: proportion, dtype: float64
In [18]:
print (f'{round(diabetic["admission_type_id"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="admission_type_id", title='admission_type_id', width=750, height=750)
fig.show()
admission_type_id
1    59.32
3    20.69
2    19.95
7     0.02
4     0.01
Name: proportion, dtype: float64
In [19]:
print (f'{round(diabetic["A1Cresult"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="A1Cresult", title='A1Cresult', width=750, height=750)
fig.show()
A1Cresult
0    82.97
3     7.94
1     5.27
2     3.82
Name: proportion, dtype: float64
In [20]:
print (f'{round(diabetic["diabetesMed"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="diabetesMed", title='diabetesMed', width=750, height=750)
fig.show()
diabetesMed
1    77.54
0    22.46
Name: proportion, dtype: float64
In [21]:
print (f'{round(diabetic["insulin"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="insulin", title='insulin', width=750, height=750)
fig.show()
insulin
0    45.23
2    31.03
1    12.55
3    11.19
Name: proportion, dtype: float64
In [22]:
print (f'{round(diabetic["age"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="age", title='age', width=750, height=750)
fig.show()
age
7    25.10
6    22.31
5    17.09
8    16.94
4     9.44
3     3.75
9     2.75
2     1.70
1     0.74
0     0.17
Name: proportion, dtype: float64

Bivariate Analysis¶

Gender and Readmission¶

In [23]:
print (f'A male has a probability of {round(diabetic[diabetic["gender"]==1]["readmitted"].mean()*100,2)} % to readmit within 30 days')

print()

print (f'A female has a probability of  {round(diabetic[diabetic["gender"]==0]["readmitted"].mean()*100,2)} % to readmit within 30 days')
A male has a probability of 11.16 % to readmit within 30 days

A female has a probability of  11.3 % to readmit within 30 days
In [24]:
fig = px.histogram(diabetic, x="gender", color="readmitted",width=600, height=600)
fig.show()

Race and Readmission¶

In [25]:
print (f'A Caucasian has a probability of {round(diabetic[diabetic["race"]==3]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A African American has a probability of {round(diabetic[diabetic["race"]==1]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A Asian has a probability of {round(diabetic[diabetic["race"]==2]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A Hispanic has a probability of {round(diabetic[diabetic["race"]==4]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A Other has a probability of {round(diabetic[diabetic["race"]==5]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()
A Caucasian has a probability of 11.36 % of readmitting within 30 days

A African American has a probability of 11.06 % of readmitting within 30 days

A Asian has a probability of 10.38 % of readmitting within 30 days

A Hispanic has a probability of 9.56 % of readmitting within 30 days

A Other has a probability of 9.45 % of readmitting within 30 days

In [26]:
fig = px.histogram(diabetic, x="race", color="readmitted",width=600, height=600)
fig.show()

Admission Type and Readmission¶

In [27]:
print (f'A patient admitted for "Emergency" has a probability of {round(diabetic[diabetic["admission_type_id"]==1]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A patient admitted for "Urgent" has a probability of {round(diabetic[diabetic["admission_type_id"]==2]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A patient admitted for "Elective" has a probability of {round(diabetic[diabetic["admission_type_id"]==3]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A patient admitted for "Newborn" has a probability of {round(diabetic[diabetic["admission_type_id"]==4]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A patient admitted for "Trauma Center" has a probability of {round(diabetic[diabetic["admission_type_id"]==7]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()
A patient admitted for "Emergency" has a probability of 11.54 % of readmitting within 30 days

A patient admitted for "Urgent" has a probability of 11.26 % of readmitting within 30 days

A patient admitted for "Elective" has a probability of 10.35 % of readmitting within 30 days

A patient admitted for "Newborn" has a probability of 10.0 % of readmitting within 30 days

A patient admitted for "Trauma Center" has a probability of 0.0 % of readmitting within 30 days

In [28]:
fig = px.histogram(diabetic, x="admission_type_id", color="readmitted",width=600, height=600)
fig.show()

A1CResult and Readmission¶

In [29]:
print (f'A patient with an A1Cresult of "None" has a probability of {round(diabetic[diabetic["A1Cresult"]==0]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A patient with an A1Cresult of "Norm" has a probability of {round(diabetic[diabetic["A1Cresult"]==1]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A patient with an A1Cresult of ">7" has a probability of {round(diabetic[diabetic["A1Cresult"]==2]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()

print (f'A patient with an A1Cresult of ">8" has a probability of {round(diabetic[diabetic["A1Cresult"]==3]["readmitted"].mean()*100,2)} % of readmitting within 30 days')

print()
A patient with an A1Cresult of "None" has a probability of 11.53 % of readmitting within 30 days

A patient with an A1Cresult of "Norm" has a probability of 9.55 % of readmitting within 30 days

A patient with an A1Cresult of ">7" has a probability of 10.04 % of readmitting within 30 days

A patient with an A1Cresult of ">8" has a probability of 9.84 % of readmitting within 30 days

In [30]:
fig = px.histogram(diabetic, x="A1Cresult", color="readmitted",width=600, height=600)
fig.show()

Metrics of Importance¶

Mutual Info Score¶

In [31]:
from sklearn.metrics import mutual_info_score
def cat_mut_inf(series):
    return mutual_info_score(series, diabetic['readmitted']) 

diabetic_cat = diabetic[categorical].apply(cat_mut_inf) 
diabetic_cat = diabetic_cat.sort_values(ascending=False).to_frame(name='mutual_info_score') 
diabetic_cat
Out[31]:
mutual_info_score
readmitted 3.513609e-01
discharge_disposition_id 8.159116e-03
insulin 1.011328e-03
diag_1 1.000959e-03
diabetesMed 3.780887e-04
change 2.860761e-04
metformin 2.782392e-04
admission_source_id 2.582642e-04
A1Cresult 2.226939e-04
admission_type_id 1.375703e-04
glipizide 6.616885e-05
race 6.380132e-05
repaglinide 5.370252e-05
pioglitazone 5.324613e-05
glimepiride 3.597550e-05
miglitol 3.188722e-05
rosiglitazone 2.577645e-05
chlorpropamide 1.407742e-05
glyburide 1.399542e-05
glyburide-metformin 1.322640e-05
acarbose 1.030866e-05
nateglinide 9.086540e-06
troglitazone 2.822069e-06
gender 2.384694e-06
tolbutamide 2.138270e-06
glimepiride-pioglitazone 1.411026e-06
metformin-pioglitazone 1.411026e-06
acetohexamide 1.411026e-06
glipizide-metformin 1.074420e-06
tolazamide 5.743538e-08
metformin-rosiglitazone 0.000000e+00
citoglipton 0.000000e+00
examide 0.000000e+00
In [32]:
diabetic[numerical].corr()
Out[32]:
age time_in_hospital num_lab_procedures num_procedures num_medications number_outpatient number_emergency number_inpatient number_diagnoses
age 1.000000 0.107300 0.031678 -0.022395 0.047998 0.020251 -0.088964 -0.041148 0.254452
time_in_hospital 0.107300 1.000000 0.326188 0.188776 0.469876 -0.000524 -0.006881 0.069348 0.231017
num_lab_procedures 0.031678 0.326188 1.000000 0.035141 0.280833 0.021724 0.010634 0.042225 0.158492
num_procedures -0.022395 0.188776 0.035141 1.000000 0.375691 -0.019010 -0.037061 -0.067005 0.069733
num_medications 0.047998 0.469876 0.280833 0.375691 1.000000 0.046070 0.014358 0.068944 0.263503
number_outpatient 0.020251 -0.000524 0.021724 -0.019010 0.046070 1.000000 0.093707 0.117058 0.093438
number_emergency -0.088964 -0.006881 0.010634 -0.037061 0.014358 0.093707 1.000000 0.270503 0.049938
number_inpatient -0.041148 0.069348 0.042225 -0.067005 0.068944 0.117058 0.270503 1.000000 0.105193
number_diagnoses 0.254452 0.231017 0.158492 0.069733 0.263503 0.093438 0.049938 0.105193 1.000000

There are barely any correlation between numerical features.

In [33]:
diabetic.groupby('readmitted')[numerical].mean()
Out[33]:
age time_in_hospital num_lab_procedures num_procedures num_medications number_outpatient number_emergency number_inpatient number_diagnoses
readmitted
0 6.073505 4.336842 43.514067 1.347914 15.884246 0.347114 0.187337 0.576847 7.483785
1 6.171094 4.745414 44.779675 1.278410 16.895425 0.429370 0.375922 1.249315 7.793169
In [34]:
diabetic[['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 
                      'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'readmitted']].corr()
Out[34]:
age time_in_hospital num_lab_procedures num_procedures num_medications number_outpatient number_emergency number_inpatient number_diagnoses readmitted
age 1.000000 0.107300 0.031678 -0.022395 0.047998 0.020251 -0.088964 -0.041148 0.254452 0.019202
time_in_hospital 0.107300 1.000000 0.326188 0.188776 0.469876 -0.000524 -0.006881 0.069348 0.231017 0.043338
num_lab_procedures 0.031678 0.326188 1.000000 0.035141 0.280833 0.021724 0.010634 0.042225 0.158492 0.020611
num_procedures -0.022395 0.188776 0.035141 1.000000 0.375691 -0.019010 -0.037061 -0.067005 0.069733 -0.012842
num_medications 0.047998 0.469876 0.280833 0.375691 1.000000 0.046070 0.014358 0.068944 0.263503 0.039314
number_outpatient 0.020251 -0.000524 0.021724 -0.019010 0.046070 1.000000 0.093707 0.117058 0.093438 0.020657
number_emergency -0.088964 -0.006881 0.010634 -0.037061 0.014358 0.093707 1.000000 0.270503 0.049938 0.060681
number_inpatient -0.041148 0.069348 0.042225 -0.067005 0.068944 0.117058 0.270503 1.000000 0.105193 0.165195
number_diagnoses 0.254452 0.231017 0.158492 0.069733 0.263503 0.093438 0.049938 0.105193 1.000000 0.051082
readmitted 0.019202 0.043338 0.020611 -0.012842 0.039314 0.020657 0.060681 0.165195 0.051082 1.000000
  • When age increases, the mean score on the readmitted also increases.
  • Correlations with the target variable are very small.
In [35]:
fig = px.scatter(diabetic, x='age', y='number_diagnoses', title='Age & # of Diagnoses ',color='readmitted', hover_data = diabetic[['readmitted']])
fig.show()
In [36]:
fig = px.scatter(diabetic, x='num_lab_procedures', y='num_medications', title='# Lab Procedures and Medications',color='readmitted', hover_data = diabetic[['readmitted']])
fig.show()

Converting Categorical Features into Numeric Features¶

In [37]:
diabetic = pd.get_dummies(diabetic, drop_first = False)

Breaking the data up into Train and Test¶

In [38]:
train_df, valid_df, test_df = np.split(diabetic.sample(frac=1, random_state=42), 
                                       [int(.7*len(diabetic)), int(0.85*len(diabetic))])
train_df = train_df.reset_index(drop = True)
valid_df = valid_df.reset_index(drop = True)
test_df = test_df.reset_index(drop = True)
In [39]:
diabetic.readmitted.value_counts()
Out[39]:
readmitted
0    74961
1     9486
Name: count, dtype: int64
In [40]:
train_df.readmitted.value_counts()
Out[40]:
readmitted
0    52499
1     6613
Name: count, dtype: int64
In [41]:
valid_df.readmitted.value_counts()
Out[41]:
readmitted
0    11230
1     1437
Name: count, dtype: int64
In [42]:
test_df.readmitted.value_counts()
Out[42]:
readmitted
0    11232
1     1436
Name: count, dtype: int64

Treating the Imbalance in Data¶

Imbalance in the data means that one of the classes in the data is too less as compared to the others. Typically, it is better to balance the data in some way to give the positives more weight. There are 3 strategies that are typically utilized:

Sub-sample the more dominant class: use a random subset of the negatives Over-sample the imbalanced class: use the same positive samples multiple times Create synthetic positive data Usually, you will want to use the latter two methods if you only have a handful of positive cases. Since we have a few thousand positive cases, let's use the sub-sample approach. Here, we will create a balanced training data set that has 50% positive and 50% negative. You can also play with this ratio to see if you can get an improvement.

In [43]:
def calc_prevalence(y_actual):
    
    '''
    This function is to understand the ratio/distribution of the classes that we are going to predict for.
    
    Params:
    1. y_actual: The target feature
    
    Return:
    1. (sum(y_actual)/len(y_actual)): The ratio of the postive class in the comlpete data.
    '''
    
    return (sum(y_actual)/len(y_actual))
In [44]:
# split the training data into positive and negative
rows_pos = train_df.readmitted == 1
df_train_pos = train_df.loc[rows_pos]
df_train_neg = train_df.loc[~rows_pos]

# merge the balanced data
sample_size = min(len(df_train_pos), len(df_train_neg))
diabetic_df_balanced = pd.concat([df_train_pos, df_train_neg.sample(n=sample_size, random_state=111)], axis=0)

# shuffle the order of training samples 
diabetic_df_balanced = diabetic_df_balanced.sample(n=len(diabetic_df_balanced), random_state=42).reset_index(drop=True)

print('Train balanced prevalence(n=%d):%.3f'%(len(diabetic_df_balanced), \
                                                calc_prevalence(diabetic_df_balanced.readmitted.values)))
Train balanced prevalence(n=13226):0.500
In [45]:
diabetic_df_balanced.readmitted.value_counts()
Out[45]:
readmitted
1    6613
0    6613
Name: count, dtype: int64
In [46]:
X_train = diabetic_df_balanced.drop('readmitted',axis=1)

y_train = diabetic_df_balanced['readmitted']

X_valid = valid_df.drop('readmitted',axis=1)

y_valid = valid_df['readmitted']

X_test = test_df.drop('readmitted',axis=1)

y_test = test_df['readmitted']
In [47]:
scaler=StandardScaler()
X_train[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']] = pd.DataFrame(scaler.fit_transform(X_train[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']]),columns=['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient'])
X_valid[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']] = pd.DataFrame(scaler.transform(X_valid[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']]),columns=['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient'])
X_test[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']] = pd.DataFrame(scaler.transform(X_test[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']]),columns=['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient'])

Creating and Understanding Models¶

In [48]:
def calc_specificity(y_actual, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)

def print_report(y_actual, y_pred, thresh = 0.5):
    
    '''
    This function calculates all the metrics to asses the machine learning models.
    
    Params:
    1. y_actual: The actual values for the target variable.
    2. y_pred: The predicted values for the target variable.
    3. thresh: The threshold for the probability to be considered as a positive class. Default value 0.5
    
    Return:
    1. AUC
    2. Accuracy
    3. Recall
    4. Precision
    5. Specificity
    '''
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity

Linear Regression¶

In [49]:
lnr = LinearRegression()
lnr.fit(X_train, y_train)


y_valid_preds = lnr.predict(X_valid)
In [50]:
y_valid_preds
Out[50]:
array([0.69345324, 0.41855594, 0.76909945, ..., 0.49366581, 0.49793699,
       0.431336  ])
In [51]:
diabetic['readmitted'].value_counts()
Out[51]:
readmitted
0    74961
1     9486
Name: count, dtype: int64

Logisitic Regression¶

In [52]:
lr=LogisticRegression(random_state = 42, solver = 'newton-cg', max_iter = 200)
lr.fit(X_train, y_train)

y_valid_preds = lr.predict_proba(X_valid)[:,1]

print('Metrics for Validation data:')

lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, 0.5)
Metrics for Validation data:
AUC:0.644
accuracy:0.662
recall:0.515
precision:0.171
specificity:0.680
prevalence:0.113
 
In [53]:
importance = abs(lr.coef_[0])
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns

print('Top 5 important variables:')
for i in indices:
    print(features[i], importance[i])
Top 5 important variables:
tolbutamide 0.5691542362986044
tolazamide 0.5490985038584199
number_inpatient 0.4555662342614939
miglitol 0.3715310590819527
chlorpropamide 0.1787726167684798

Explaining Results¶

KNN Model¶

In [54]:
knn = KNeighborsClassifier(n_neighbors = 100)
knn.fit(X_train, y_train)

knn_preds = knn.predict_proba(X_valid)[:,1]

lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,knn_preds, 0.5)
AUC:0.492
accuracy:0.508
recall:0.473
precision:0.110
specificity:0.440
prevalence:0.113
 
In [55]:
# Corrected the loss parameter to 'log_loss'
sgdc = SGDClassifier(loss='log_loss', alpha=0.1, random_state=42)
sgdc.fit(X_train, y_train)

sgd_preds = sgdc.predict_proba(X_valid)[:, 1]

print('Stochastic Gradient Descent')
print('Validation:')
sgdc_valid_auc, sgdc_valid_accuracy, sgdc_valid_recall, \
                sgdc_valid_precision, sgdc_valid_specificity = print_report(y_valid,sgd_preds, 0.5)
Stochastic Gradient Descent
Validation:
AUC:0.499
accuracy:0.885
recall:0.003
precision:0.121
specificity:0.997
prevalence:0.113
 
In [56]:
dc_clf = DecisionTreeClassifier(random_state=42, max_depth = 10)
dc_clf.fit(X_train, y_train)

dc_preds_proba = dc_clf.predict_proba(X_valid)[:,1]
dc_preds = dc_clf.predict(X_valid)

lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,dc_preds_proba, 0.5)
AUC:0.621
accuracy:0.576
recall:0.617
precision:0.156
specificity:0.556
prevalence:0.113
 
In [57]:
importance = dc_clf.feature_importances_
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns

print('Top 5 important variables:')
for i in indices:
    print(features[i], importance[i])
Top 5 important variables:
number_inpatient 0.2048007112320039
discharge_disposition_id 0.17745895981290255
Unnamed: 0 0.1157032083947385
num_medications 0.06891705668441435
num_lab_procedures 0.06413676475721254
In [58]:
rf_clf = RandomForestClassifier(random_state=111, max_depth = 6)

rf_clf.fit(X_train, y_train)

rf_preds = rf_clf.predict(X_valid)
rf_preds_proba = rf_clf.predict_proba(X_valid)[:, 1]

lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,rf_preds_proba, 0.5)
AUC:0.664
accuracy:0.599
recall:0.644
precision:0.168
specificity:0.593
prevalence:0.113
 
In [59]:
rf_clf = RandomForestClassifier(random_state=111, max_depth = 6)
rf_clf.fit(X_train, y_train)

importance = rf_clf.feature_importances_
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns

print('Top 5 important variables:')
for i in indices:
    print(features[i], importance[i])
Top 5 important variables:
number_inpatient 0.3027234895031829
discharge_disposition_id 0.16583873959191758
number_emergency 0.06512290444808795
num_medications 0.05209133801434666
number_diagnoses 0.049957598848719365
In [60]:
lsvc_clf = LinearSVC(random_state=111)
lsvc_clf.fit(X_train, y_train)

lsvc_preds = lsvc_clf.decision_function(X_valid)

lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,lsvc_preds, 0.5)
AUC:0.644
accuracy:0.878
recall:0.073
precision:0.324
specificity:0.980
prevalence:0.113
 
In [61]:
importance = np.abs(lsvc_clf.coef_[0])
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns

print('Top 5 important variables:')
for i in indices:
    print(features[i], importance[i])
Top 5 important variables:
number_inpatient 0.19860000502746336
diabetesMed 0.05867793199326875
admission_type_id 0.052252363330388296
metformin 0.048384335855877904
glimepiride 0.04765938820341512
In [62]:
gb_clf = GradientBoostingClassifier(n_estimators = 100, criterion='friedman_mse', learning_rate = 1.0, max_depth = 3,\
                                    random_state = 111)

gb_clf.fit(X_train, y_train)
gb_preds = gb_clf.predict(X_valid)
gb_preds_proba = gb_clf.predict_proba(X_valid)[:, 1]

lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,gb_preds_proba, 0.5)
AUC:0.625
accuracy:0.600
recall:0.571
precision:0.156
specificity:0.604
prevalence:0.113
 
In [63]:
importance = gb_clf.feature_importances_
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns

print('Top 5 important variables:')
for i in indices:
    print(features[i], importance[i])
Top 5 important variables:
number_inpatient 0.20424845705452066
discharge_disposition_id 0.18498586187035168
Unnamed: 0 0.1723816558576928
num_medications 0.06102020776719027
num_lab_procedures 0.05517155454174269
In [64]:
xgb_clf = xgb.XGBClassifier(max_depth=3, learning_rate = 1.0, use_label_encoder = False,\
                            eval_metric = 'logloss')
xgb_clf.fit(X_train, y_train)

xgb_preds = xgb_clf.predict(X_valid)
xgb_preds_proba = xgb_clf.predict_proba(X_valid)[:, 1]

lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,xgb_preds_proba, 0.5)
AUC:0.643
accuracy:0.608
recall:0.598
precision:0.164
specificity:0.609
prevalence:0.113
 
In [65]:
importance = xgb_clf.feature_importances_
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns

print('Top 5 important variables:')
for i in indices:
    print(f"{features[i]}: {importance[i]}")
Top 5 important variables:
number_inpatient: 0.16212581098079681
discharge_disposition_id: 0.10900657624006271
admission_source_id: 0.042793795466423035
change: 0.034398604184389114
number_emergency: 0.0335787869989872
In [66]:
catb=CatBoostClassifier(iterations=200, depth=3, learning_rate=1.0, random_state = 111)
catb.fit(X_train, y_train)
catb_preds = catb.predict_proba(X_valid)[:, 1]

lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
    lr_valid_precision, lr_valid_specificity = print_report(y_valid,catb_preds, 0.5)
0:	learn: 0.6648965	total: 162ms	remaining: 32.2s
1:	learn: 0.6600847	total: 168ms	remaining: 16.6s
2:	learn: 0.6526058	total: 173ms	remaining: 11.4s
3:	learn: 0.6509617	total: 178ms	remaining: 8.73s
4:	learn: 0.6483122	total: 184ms	remaining: 7.16s
5:	learn: 0.6457996	total: 189ms	remaining: 6.12s
6:	learn: 0.6449968	total: 194ms	remaining: 5.35s
7:	learn: 0.6437398	total: 200ms	remaining: 4.79s
8:	learn: 0.6425606	total: 205ms	remaining: 4.36s
9:	learn: 0.6416257	total: 211ms	remaining: 4.01s
10:	learn: 0.6407779	total: 217ms	remaining: 3.73s
11:	learn: 0.6395777	total: 223ms	remaining: 3.49s
12:	learn: 0.6381783	total: 228ms	remaining: 3.28s
13:	learn: 0.6373445	total: 234ms	remaining: 3.11s
14:	learn: 0.6362202	total: 239ms	remaining: 2.95s
15:	learn: 0.6351042	total: 245ms	remaining: 2.82s
16:	learn: 0.6342171	total: 251ms	remaining: 2.7s
17:	learn: 0.6333739	total: 257ms	remaining: 2.6s
18:	learn: 0.6323785	total: 262ms	remaining: 2.5s
19:	learn: 0.6316892	total: 267ms	remaining: 2.41s
20:	learn: 0.6308403	total: 272ms	remaining: 2.32s
21:	learn: 0.6304525	total: 278ms	remaining: 2.25s
22:	learn: 0.6297335	total: 283ms	remaining: 2.18s
23:	learn: 0.6288386	total: 289ms	remaining: 2.12s
24:	learn: 0.6279195	total: 295ms	remaining: 2.06s
25:	learn: 0.6253405	total: 300ms	remaining: 2.01s
26:	learn: 0.6250961	total: 306ms	remaining: 1.96s
27:	learn: 0.6244931	total: 312ms	remaining: 1.92s
28:	learn: 0.6238170	total: 317ms	remaining: 1.87s
29:	learn: 0.6233927	total: 324ms	remaining: 1.83s
30:	learn: 0.6225386	total: 330ms	remaining: 1.8s
31:	learn: 0.6217847	total: 335ms	remaining: 1.76s
32:	learn: 0.6213437	total: 341ms	remaining: 1.73s
33:	learn: 0.6203912	total: 346ms	remaining: 1.69s
34:	learn: 0.6196070	total: 352ms	remaining: 1.66s
35:	learn: 0.6190562	total: 357ms	remaining: 1.63s
36:	learn: 0.6180958	total: 363ms	remaining: 1.6s
37:	learn: 0.6174709	total: 369ms	remaining: 1.57s
38:	learn: 0.6167741	total: 374ms	remaining: 1.54s
39:	learn: 0.6162518	total: 380ms	remaining: 1.52s
40:	learn: 0.6156439	total: 385ms	remaining: 1.49s
41:	learn: 0.6146355	total: 391ms	remaining: 1.47s
42:	learn: 0.6143936	total: 396ms	remaining: 1.45s
43:	learn: 0.6138505	total: 402ms	remaining: 1.43s
44:	learn: 0.6137138	total: 408ms	remaining: 1.4s
45:	learn: 0.6129805	total: 413ms	remaining: 1.38s
46:	learn: 0.6120866	total: 419ms	remaining: 1.36s
47:	learn: 0.6117924	total: 425ms	remaining: 1.34s
48:	learn: 0.6108493	total: 431ms	remaining: 1.33s
49:	learn: 0.6105826	total: 437ms	remaining: 1.31s
50:	learn: 0.6100997	total: 443ms	remaining: 1.29s
51:	learn: 0.6091630	total: 448ms	remaining: 1.27s
52:	learn: 0.6084778	total: 453ms	remaining: 1.26s
53:	learn: 0.6078771	total: 459ms	remaining: 1.24s
54:	learn: 0.6071188	total: 465ms	remaining: 1.23s
55:	learn: 0.6063470	total: 470ms	remaining: 1.21s
56:	learn: 0.6057888	total: 475ms	remaining: 1.19s
57:	learn: 0.6057100	total: 481ms	remaining: 1.18s
58:	learn: 0.6052223	total: 487ms	remaining: 1.16s
59:	learn: 0.6047211	total: 493ms	remaining: 1.15s
60:	learn: 0.6041776	total: 498ms	remaining: 1.13s
61:	learn: 0.6038092	total: 504ms	remaining: 1.12s
62:	learn: 0.6033687	total: 509ms	remaining: 1.11s
63:	learn: 0.6028513	total: 514ms	remaining: 1.09s
64:	learn: 0.6020972	total: 520ms	remaining: 1.08s
65:	learn: 0.6016110	total: 525ms	remaining: 1.07s
66:	learn: 0.6013313	total: 532ms	remaining: 1.05s
67:	learn: 0.6006054	total: 539ms	remaining: 1.04s
68:	learn: 0.6004157	total: 545ms	remaining: 1.03s
69:	learn: 0.5998223	total: 551ms	remaining: 1.02s
70:	learn: 0.5997358	total: 556ms	remaining: 1.01s
71:	learn: 0.5992190	total: 562ms	remaining: 998ms
72:	learn: 0.5986042	total: 567ms	remaining: 986ms
73:	learn: 0.5980407	total: 572ms	remaining: 975ms
74:	learn: 0.5973969	total: 578ms	remaining: 963ms
75:	learn: 0.5970157	total: 585ms	remaining: 954ms
76:	learn: 0.5967069	total: 590ms	remaining: 943ms
77:	learn: 0.5959242	total: 597ms	remaining: 934ms
78:	learn: 0.5953736	total: 603ms	remaining: 924ms
79:	learn: 0.5948974	total: 609ms	remaining: 913ms
80:	learn: 0.5942228	total: 615ms	remaining: 903ms
81:	learn: 0.5935498	total: 620ms	remaining: 893ms
82:	learn: 0.5929510	total: 626ms	remaining: 883ms
83:	learn: 0.5926026	total: 632ms	remaining: 873ms
84:	learn: 0.5919484	total: 638ms	remaining: 863ms
85:	learn: 0.5912187	total: 644ms	remaining: 853ms
86:	learn: 0.5905671	total: 648ms	remaining: 842ms
87:	learn: 0.5897125	total: 653ms	remaining: 831ms
88:	learn: 0.5889698	total: 658ms	remaining: 821ms
89:	learn: 0.5884948	total: 664ms	remaining: 812ms
90:	learn: 0.5878020	total: 669ms	remaining: 802ms
91:	learn: 0.5873910	total: 674ms	remaining: 792ms
92:	learn: 0.5868850	total: 680ms	remaining: 782ms
93:	learn: 0.5863924	total: 685ms	remaining: 773ms
94:	learn: 0.5858218	total: 691ms	remaining: 764ms
95:	learn: 0.5852027	total: 697ms	remaining: 756ms
96:	learn: 0.5845692	total: 703ms	remaining: 747ms
97:	learn: 0.5838478	total: 709ms	remaining: 738ms
98:	learn: 0.5832592	total: 715ms	remaining: 729ms
99:	learn: 0.5827199	total: 720ms	remaining: 720ms
100:	learn: 0.5820375	total: 726ms	remaining: 712ms
101:	learn: 0.5819110	total: 732ms	remaining: 703ms
102:	learn: 0.5817337	total: 737ms	remaining: 694ms
103:	learn: 0.5815150	total: 743ms	remaining: 686ms
104:	learn: 0.5808800	total: 749ms	remaining: 678ms
105:	learn: 0.5805232	total: 755ms	remaining: 669ms
106:	learn: 0.5801613	total: 761ms	remaining: 661ms
107:	learn: 0.5797089	total: 766ms	remaining: 653ms
108:	learn: 0.5790927	total: 771ms	remaining: 644ms
109:	learn: 0.5784704	total: 777ms	remaining: 636ms
110:	learn: 0.5782979	total: 783ms	remaining: 628ms
111:	learn: 0.5777635	total: 789ms	remaining: 620ms
112:	learn: 0.5774884	total: 795ms	remaining: 612ms
113:	learn: 0.5768454	total: 800ms	remaining: 604ms
114:	learn: 0.5763961	total: 806ms	remaining: 596ms
115:	learn: 0.5755184	total: 811ms	remaining: 587ms
116:	learn: 0.5751418	total: 817ms	remaining: 579ms
117:	learn: 0.5746229	total: 823ms	remaining: 572ms
118:	learn: 0.5741923	total: 828ms	remaining: 564ms
119:	learn: 0.5741659	total: 834ms	remaining: 556ms
120:	learn: 0.5738050	total: 840ms	remaining: 548ms
121:	learn: 0.5732728	total: 845ms	remaining: 540ms
122:	learn: 0.5730028	total: 851ms	remaining: 533ms
123:	learn: 0.5725906	total: 856ms	remaining: 525ms
124:	learn: 0.5723736	total: 862ms	remaining: 517ms
125:	learn: 0.5719188	total: 866ms	remaining: 509ms
126:	learn: 0.5714192	total: 871ms	remaining: 501ms
127:	learn: 0.5710013	total: 877ms	remaining: 493ms
128:	learn: 0.5705755	total: 883ms	remaining: 486ms
129:	learn: 0.5700396	total: 887ms	remaining: 478ms
130:	learn: 0.5695674	total: 892ms	remaining: 470ms
131:	learn: 0.5689367	total: 897ms	remaining: 462ms
132:	learn: 0.5682862	total: 903ms	remaining: 455ms
133:	learn: 0.5677623	total: 908ms	remaining: 447ms
134:	learn: 0.5676602	total: 914ms	remaining: 440ms
135:	learn: 0.5671312	total: 920ms	remaining: 433ms
136:	learn: 0.5665800	total: 925ms	remaining: 426ms
137:	learn: 0.5660965	total: 931ms	remaining: 418ms
138:	learn: 0.5657441	total: 936ms	remaining: 411ms
139:	learn: 0.5652879	total: 942ms	remaining: 404ms
140:	learn: 0.5648533	total: 948ms	remaining: 397ms
141:	learn: 0.5644145	total: 953ms	remaining: 389ms
142:	learn: 0.5642917	total: 959ms	remaining: 382ms
143:	learn: 0.5637970	total: 963ms	remaining: 375ms
144:	learn: 0.5632082	total: 969ms	remaining: 368ms
145:	learn: 0.5626776	total: 975ms	remaining: 361ms
146:	learn: 0.5620795	total: 981ms	remaining: 354ms
147:	learn: 0.5615993	total: 986ms	remaining: 346ms
148:	learn: 0.5610694	total: 991ms	remaining: 339ms
149:	learn: 0.5607280	total: 997ms	remaining: 332ms
150:	learn: 0.5605829	total: 1s	remaining: 325ms
151:	learn: 0.5601025	total: 1.01s	remaining: 319ms
152:	learn: 0.5596164	total: 1.01s	remaining: 312ms
153:	learn: 0.5591669	total: 1.02s	remaining: 305ms
154:	learn: 0.5586668	total: 1.02s	remaining: 298ms
155:	learn: 0.5586449	total: 1.03s	remaining: 291ms
156:	learn: 0.5584920	total: 1.04s	remaining: 284ms
157:	learn: 0.5581205	total: 1.04s	remaining: 277ms
158:	learn: 0.5576830	total: 1.05s	remaining: 270ms
159:	learn: 0.5571034	total: 1.05s	remaining: 263ms
160:	learn: 0.5567958	total: 1.06s	remaining: 256ms
161:	learn: 0.5563886	total: 1.06s	remaining: 250ms
162:	learn: 0.5558839	total: 1.07s	remaining: 243ms
163:	learn: 0.5555623	total: 1.07s	remaining: 236ms
164:	learn: 0.5549317	total: 1.08s	remaining: 229ms
165:	learn: 0.5543260	total: 1.08s	remaining: 222ms
166:	learn: 0.5536682	total: 1.09s	remaining: 216ms
167:	learn: 0.5529090	total: 1.09s	remaining: 209ms
168:	learn: 0.5524482	total: 1.1s	remaining: 202ms
169:	learn: 0.5517712	total: 1.11s	remaining: 195ms
170:	learn: 0.5514550	total: 1.11s	remaining: 189ms
171:	learn: 0.5509897	total: 1.12s	remaining: 182ms
172:	learn: 0.5506147	total: 1.12s	remaining: 175ms
173:	learn: 0.5501039	total: 1.13s	remaining: 169ms
174:	learn: 0.5499846	total: 1.13s	remaining: 162ms
175:	learn: 0.5499359	total: 1.14s	remaining: 155ms
176:	learn: 0.5496269	total: 1.14s	remaining: 149ms
177:	learn: 0.5496103	total: 1.15s	remaining: 142ms
178:	learn: 0.5491435	total: 1.16s	remaining: 136ms
179:	learn: 0.5490699	total: 1.16s	remaining: 129ms
180:	learn: 0.5486602	total: 1.17s	remaining: 123ms
181:	learn: 0.5484101	total: 1.17s	remaining: 116ms
182:	learn: 0.5479443	total: 1.18s	remaining: 110ms
183:	learn: 0.5477453	total: 1.19s	remaining: 103ms
184:	learn: 0.5467985	total: 1.19s	remaining: 96.6ms
185:	learn: 0.5463339	total: 1.2s	remaining: 90.1ms
186:	learn: 0.5458361	total: 1.2s	remaining: 83.7ms
187:	learn: 0.5451832	total: 1.21s	remaining: 77.2ms
188:	learn: 0.5446521	total: 1.22s	remaining: 70.7ms
189:	learn: 0.5440583	total: 1.22s	remaining: 64.4ms
190:	learn: 0.5436080	total: 1.23s	remaining: 57.9ms
191:	learn: 0.5432104	total: 1.24s	remaining: 51.5ms
192:	learn: 0.5428181	total: 1.24s	remaining: 45ms
193:	learn: 0.5423089	total: 1.25s	remaining: 38.6ms
194:	learn: 0.5418352	total: 1.25s	remaining: 32.2ms
195:	learn: 0.5412436	total: 1.26s	remaining: 25.7ms
196:	learn: 0.5406283	total: 1.27s	remaining: 19.3ms
197:	learn: 0.5401505	total: 1.27s	remaining: 12.9ms
198:	learn: 0.5396560	total: 1.28s	remaining: 6.42ms
199:	learn: 0.5394535	total: 1.28s	remaining: 0us
AUC:0.634
accuracy:0.609
recall:0.583
precision:0.161
specificity:0.612
prevalence:0.113
 
In [67]:
recall_scoring = make_scorer(recall_score)
In [68]:
dc_grid = {'max_features':['auto','sqrt'], # maximum number of features to use at each split
           'max_depth':range(1,11,1), # maximum depth of the tree
           'min_samples_split':range(2,10,2), # minimum number of samples to split a node
           'criterion':['gini','entropy']} # criterion for evaluating a split

dc_random = RandomizedSearchCV(estimator = dc_clf, param_distributions = dc_grid, 
                               n_iter = 20, cv = 2, scoring=recall_scoring,
                               verbose = 1, random_state = 111)

dc_random.fit(X_train, y_train)

dc_random.best_params_

dc_hp_preds = dc_random.best_estimator_.predict(X_valid)
dc_hp_preds_proba = dc_random.best_estimator_.predict_proba(X_valid)[:,1]
roc_auc_score(y_valid, dc_hp_preds_proba)
Fitting 2 folds for each of 20 candidates, totalling 40 fits
Out[68]:
0.5322862696909251
In [69]:
recall_score(y_valid, dc_hp_preds)
Out[69]:
0.6798886569241476
In [70]:
xgb_grid = params = {
        'min_child_weight': [1, 5, 8, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 0.9, 1.0],
        'max_depth': [3, 4, 5]
        } # criterion for evaluating a split

xgb_random = GridSearchCV(estimator = xgb_clf, param_grid = xgb_grid, 
                               cv = 2, scoring = recall_scoring,
                               verbose = 1)

xgb_random.fit(X_train, y_train)

xgb_random.best_params_

xgb_hp_preds = xgb_random.best_estimator_.predict(X_valid)
xgb_hp_preds_proba = xgb_random.best_estimator_.predict_proba(X_valid)[:,1]
roc_auc_score(y_valid, xgb_hp_preds_proba)
Fitting 2 folds for each of 720 candidates, totalling 1440 fits
Out[70]:
0.6553197798173325
In [71]:
recall_score(y_valid, xgb_hp_preds)
Out[71]:
0.6339596381350034
In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, recall_score, precision_score, confusion_matrix
import numpy as np

# define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [6, 8],
    'max_features': ['sqrt', 'log2']
}

# create a function to calculate all the evaluation metrics
def print_report(y_true, y_pred_prob, threshold):
    y_pred = (y_pred_prob >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    auc = roc_auc_score(y_true, y_pred_prob)
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    specificity = tn / (tn + fp)
    prevalence = np.mean(y_true)
    print("AUC:{:.3f}".format(auc))
    print("accuracy:{:.3f}".format(accuracy))
    print("recall:{:.3f}".format(recall))
    print("precision:{:.3f}".format(precision))
    print("specificity:{:.3f}".format(specificity))
    print("prevalence:{:.3f}".format(prevalence))

# create the random forest classifier with default parameters
rf_clf = RandomForestClassifier(random_state=111)

# create a dictionary to store the evaluation metrics for each combination of hyperparameters
results = {}

# perform a grid search to find the best combination of hyperparameters
clf = GridSearchCV(rf_clf, param_grid, scoring=make_scorer(roc_auc_score), cv=3, n_jobs=-1)
clf.fit(X_train, y_train)

# print the best hyperparameters and their corresponding scores
print("Best hyperparameters: ", clf.best_params_)
best_clf = clf.best_estimator_
rf_preds_proba = best_clf.predict_proba(X_valid)[:, 1]
print_report(y_valid, rf_preds_proba, 0.5)
Best hyperparameters:  {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 100}
AUC:0.669
accuracy:0.619
recall:0.622
precision:0.173
specificity:0.619
prevalence:0.113
In [74]:
importances = best_clf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%d. %s (%f)" % (f + 1, X_train.columns[indices[f]], importances[indices[f]]))
1. number_inpatient (0.241661)
2. discharge_disposition_id (0.130218)
3. Unnamed: 0 (0.073085)
4. num_medications (0.058445)
5. num_lab_procedures (0.056034)
6. number_emergency (0.054367)
7. number_diagnoses (0.047471)
8. age (0.043715)
9. diag_1 (0.043168)
10. time_in_hospital (0.041978)
11. number_outpatient (0.024163)
12. num_procedures (0.023491)
13. insulin (0.021783)
14. A1Cresult (0.014635)
15. metformin (0.013837)
16. race (0.013692)
17. admission_type_id (0.013684)
18. admission_source_id (0.012857)
19. diabetesMed (0.010361)
20. glipizide (0.009341)
21. change (0.008494)
22. glyburide (0.007365)
23. pioglitazone (0.007250)
24. gender (0.007002)
25. glimepiride (0.005691)
26. rosiglitazone (0.004892)
27. repaglinide (0.004155)
28. nateglinide (0.002187)
29. glyburide-metformin (0.002112)
30. max_glu_serum (0.001936)
31. acarbose (0.000724)
32. chlorpropamide (0.000091)
33. tolazamide (0.000049)
34. miglitol (0.000026)
35. tolbutamide (0.000023)
36. glipizide-metformin (0.000019)
37. examide (0.000000)
38. citoglipton (0.000000)
39. troglitazone (0.000000)
40. glimepiride-pioglitazone (0.000000)
41. metformin-rosiglitazone (0.000000)
42. metformin-pioglitazone (0.000000)
43. acetohexamide (0.000000)